Report 2

Jason Ola

2021-07-19

Import packages

library(tidyverse)
library(tidygraph)
library(ggraph)

Part 1 - Getting the data

# Read the Pajek file as a space-delimited table. The column names used below
# ("*Network", "Movies.net", "[2-Mode]") are the ones the rest of this chunk
# expects to come out of that read.
movies_data <- read_delim("data/Movies.paj", delim = " ")

# Rows 2-103 are taken as the node list: keep the id and the name, drop the
# third column.
movies_nodes <- movies_data %>%
  slice(2:103) %>%
  rename(node_id = `*Network`,
         name = `Movies.net`,
         mode = `[2-Mode]`) %>%
  select(-mode)

# Rows 106-297 are taken as the edge list: source id, target id and the
# number of collaborations.
movies_edges <- movies_data %>%
  slice(106:297) %>%
  rename(from = `*Network`,
         to = `Movies.net`,
         n_collabs = `[2-Mode]`)

Cleaning the column classes

# The id columns must be integers before they can be used to build the graph;
# every other column is left exactly as it was read.
movies_nodes <- mutate(movies_nodes, node_id = as.integer(node_id))

movies_edges <- mutate(
  movies_edges,
  from = as.integer(from),
  to = as.integer(to)
)

Creating the graph

# Build the directed graph from the cleaned node and edge tables.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
directed_tree <- tbl_graph(nodes = movies_nodes, 
                           edges = movies_edges, 
                           directed = TRUE)

# Number of nodes in the graph: the row count of the node table.
n_nodes <- directed_tree %N>% 
  as_tibble() %>% 
  nrow()

There are 102 nodes

Filter out the isolated node and separate producers from composers

# Work on the node table: tag every node with its role (ids below 63 are
# producers, all others composers), then drop nodes that have no edge at all.
directed_tree <- directed_tree %>%
  activate(nodes) %>%
  mutate(role = if_else(node_id < 63, "Producer", "Composer")) %>%
  filter(!node_is_isolated())

Part 2

Do an initial plot

# First look at the full network: thin light edges with arrow heads, and each
# node drawn as its id (rotated 90 degrees) coloured by role, on a dark
# background.
ggraph(directed_tree) +
  geom_edge_link(
    arrow = arrow(length = unit(2, "mm")),
    end_cap = circle(2, "mm"),
    start_cap = circle(2, "mm"),
    color = "#FDFFFC",
    width = 0.1
  ) +
  geom_node_text(
    aes(label = node_id, color = role),
    angle = 90,
    size = 1.8
  ) +
  scale_color_manual(values = c("#2EC4B6", "#E71D36")) +
  labs(title = "Initial plot") +
  theme_graph() +
  theme(
    legend.title = element_blank(),
    plot.background = element_rect(fill = "#011627"),
    text = element_text(color = "#FDFFFC"),
    plot.title = element_text(color = "#FDFFFC")
  )

Here is a table mapping the IDs to the corresponding names

# Interactive lookup table of the nodes (id, name, role) with
# presentation-friendly column headers and no row names.
directed_tree %>%
  activate(nodes) %>%
  as_tibble() %>%
  rename(ID = node_id, Name = name, Role = role) %>%
  DT::datatable(rownames = FALSE)

Part 3

Producers that worked with the same composers

# Lookup of original node ids and names for the nodes still in the graph,
# in node-table order.
name_id <- directed_tree %N>% 
  as_tibble() %>% 
  select(node_id, name)

# Build the producer-to-producer graph: two producers are linked when they
# share a target (`to`) node in `directed_tree`.
# Each pair is produced in both directions, so the undirected result carries
# two parallel edges per shared target.
producer_tree <- directed_tree %E>%
  as_tibble() %>% 
  select(from, to) %>% 
  # `from`/`to` of a tbl_graph are row positions in the current node table.
  # After nodes have been filtered out they no longer equal `node_id`, so
  # translate them back before they are used as ids further down.
  mutate(from = name_id$node_id[from],
         to = name_id$node_id[to]) %>% 
  group_by(to) %>% 
  # collect every distinct source id sharing this target into a list
  mutate(new_from = list(unique(from))) %>% 
  # one row per (source, co-source) combination
  unnest_longer(new_from) %>% 
  # filter out the loops
  filter(from != new_from) %>% 
  ungroup() %>% 
  select(from = new_from, to = from) %>% 
  # the ids become the (character) node names of the new graph
  as_tbl_graph() %N>% 
  select(node_id = name) %>% 
  mutate(node_id = as.integer(node_id)) %>% 
  left_join(name_id, by = "node_id") %>% 
  # Make it undirected, there is no hierarchy. `to_undirected()` is a morpher
  # (it returns a list of graphs), so it is applied through `convert()` to get
  # a tbl_graph back; `.clean = TRUE` drops the morph bookkeeping columns.
  convert(to_undirected, .clean = TRUE)
# Plain view of the producer graph: semi-transparent light edges and small
# light points on the dark background, no labels.
ggraph(producer_tree) +
  geom_edge_link(
    width = 0.1,
    color = "#FDFFFC",
    alpha = 0.5
  ) +
  geom_node_point(
    size = 0.5,
    color = "#FDFFFC"
  ) +
  labs(
    title = "Initial plot",
    subtitle = "Which producers worked with the same composers ?"
  ) +
  theme_graph() +
  theme(
    plot.background = element_rect(fill = "#011627"),
    plot.title = element_text(color = "#FDFFFC"),
    plot.subtitle = element_text(color = "#FDFFFC")
  )

Part 4

Let’s try some clustering

# Colours used for the communities when there are at most three of them.
cluster_palette <- c("#2EC4B6", "#E71D36", "green")

#' Plot a graph with its nodes coloured by detected community
#'
#' @param graph A tbl_graph whose nodes are to be clustered.
#' @param group_fun A tidygraph grouping function (e.g. `group_louvain`),
#'   passed unevaluated; it is called inside `mutate()` on the node table.
#' @param title Plot title.
#' @param palette Optional vector of colours for the groups. It is only
#'   applied when it holds at least as many colours as there are groups,
#'   because a manual scale that is too short makes ggplot2 fail when the
#'   plot is drawn; otherwise the default colour scale is used.
#' @return A ggraph/ggplot object.
plot_clusters <- function(graph, group_fun, title, palette = NULL) {
  clustered <- graph %N>% 
    mutate(Group = as.factor(group_fun()))

  n_groups <- clustered %>% 
    as_tibble() %>% 
    pull(Group) %>% 
    nlevels()

  cluster_plot <- ggraph(clustered) +
    geom_edge_link(width = 0.1,
                   color = "#FDFFFC",
                   alpha = 0.2) +
    geom_node_point(aes(color = Group),
                    size = 0.8) +
    labs(title = title,
         subtitle = "Which producers worked with the same composers ?") +
    theme_graph() +
    theme(plot.background = element_rect(fill = "#011627"),
          plot.title = element_text(color = "#FDFFFC"),
          plot.subtitle = element_text(color = "#FDFFFC"),
          text = element_text(color = "white"))

  if (!is.null(palette) && length(palette) >= n_groups) {
    cluster_plot <- cluster_plot + scale_color_manual(values = palette)
  }

  cluster_plot
}

# Some of the algorithms below are stochastic; fixing the seed keeps the
# rendered plots identical from one knit to the next. Change or remove it to
# see how much the partitions vary between runs.
set.seed(2021)

plot_clusters(producer_tree,
              group_leading_eigen,
              title = "Leading eigen clustering algorithm",
              palette = cluster_palette)

plot_clusters(producer_tree,
              group_louvain,
              title = "Louvain clustering algorithm",
              palette = cluster_palette)

# Spinglass keeps the default colour scale, so no palette is passed.
plot_clusters(producer_tree,
              group_spinglass,
              title = "Spinglass clustering algorithm")

I computed the plots for three different clustering algorithms and ran each of them several times to see whether the output changes. The spinglass algorithm tends to divide the graph into more subgroups (here 5, sometimes 6) and tries to group outliers together, but its output often differs from one run to the next. The leading eigen algorithm divides the graph into 3 well-separated groups, and its output is consistent across runs. The Louvain algorithm is also consistent, but some green nodes end up mixed into the blue cluster, which we might not want. I would therefore choose the leading eigen algorithm for this graph analysis.

Part 5

Let’s touch up our graph a little more

# Final figure: leading-eigen communities drawn as shaded hulls, with each
# node shown as its id in the colour of its community.
producer_tree %>% 
  mutate(Group = group_leading_eigen() %>% as.factor()) %>% 
  ggraph() +
  # Hulls go first so that edges and labels are drawn on top of them.
  ggforce::geom_mark_hull(aes(x, y,
                              group = Group, 
                              fill = Group),
                          alpha = 0.1,
                          concavity = 5,
                          size = 0.3) +
  geom_edge_link(width = 0.1,
                 color = "#FDFFFC",
                 alpha = 0.1) +
  geom_node_text(aes(label = node_id,
                     color = Group),
                 size = 2) +
  labs(title = "Leading eigen clustering algorithm",
       subtitle = "Which producers worked with the same composers ?",
       # The URL is assembled with paste0() so that wrapping the source line
       # does not put a line break and indentation spaces inside the caption.
       caption = paste0("Source : sites.google.com/site/",
                        "ucinetsoftware/datasets/hollywoodfilmmusic")) +
  scale_color_manual(values = c("#2EC4B6", "#E71D36", "green")) +
  scale_fill_manual(values = c("#2EC4B6", "#E71D36", "green")) +
  theme_graph() +
  theme(plot.background = element_rect(fill = "#011627"),
        plot.title = element_text(color = "#FDFFFC"),
        plot.subtitle = element_text(color = "#FDFFFC"),
        text = element_text(color = "white"),
        plot.caption = element_text(color = "#FDFFFC",
                                    size = 8))